#!/usr/bin/env python
# -*- coding: utf-8 -*-

from numpy import *


def load_data_set():
    """
    Build the toy training corpus and its label vector.

    Each sample is kept as a (label, text) pair and split on whitespace
    into tokens, so a post and its label stay side by side in the source.

    :return: tuple ``(posting_list, class_vector)`` where ``posting_list`` is
        a list of six token lists and ``class_vector`` is the list of the
        matching integer labels (0 or 1), in the same order.
    """
    labelled_posts = (
        (0, 'my dog has flea problem help please'),
        (1, 'maybe not take him to dog park stupid'),
        (0, 'my dalmation is so cute I love him'),
        (1, 'stop posting stupid worthless garbage'),
        (0, 'mr licks ate my steak how to stop him'),
        (1, 'quit buying worthless dog food stupid'),
    )
    posting_list = [text.split() for _, text in labelled_posts]
    class_vector = [label for label, _ in labelled_posts]
    return posting_list, class_vector


def create_vocabulary_list(data_set):
    """
    Collect the distinct words that appear in the training documents.

    Words are returned in order of first occurrence. Converting a set to a
    list gives an unspecified order that can change from one interpreter
    run to the next, which would make the feature-vector column layout
    non-reproducible; tracking first occurrences keeps it stable.

    :param data_set: iterable of documents, each an iterable of hashable
        words.
    :return: list of the unique words, each appearing exactly once.
    """
    seen = set()
    vocabulary = []
    for document in data_set:
        for word in document:
            if word not in seen:
                seen.add(word)
                vocabulary.append(word)
    return vocabulary


def word_set_to_vector(vocabulary_list, input_set):
    """
    Convert a document into a 0/1 presence vector over the vocabulary.

    :param vocabulary_list: sequence of known (hashable) words; position i
        of the result corresponds to ``vocabulary_list[i]``.
    :param input_set: iterable of words making up one document.
    :return: list of ints with the same length as ``vocabulary_list``; an
        entry is 1 if the corresponding word occurs in ``input_set``, else 0.
        Words missing from the vocabulary are reported on stdout and
        otherwise ignored.
    """
    # Build the word -> position table once instead of scanning the
    # vocabulary twice (membership test + index lookup) for every word.
    # setdefault keeps the first position when the vocabulary contains
    # duplicates, matching what list.index() would return.
    position_of = {}
    for position, known_word in enumerate(vocabulary_list):
        position_of.setdefault(known_word, position)

    return_vector = [0] * len(vocabulary_list)
    for word in input_set:
        position = position_of.get(word)
        if position is None:
            print("the word: %s is not in my vocabulary!" % word)
        else:
            return_vector[position] = 1
    return return_vector


def train_nb0(train_matrix, train_category):
    """
    Train a naive Bayes classifier from word-presence vectors.

    For each of the two classes the word vectors of its documents are
    summed, then divided by the total of all vector entries seen for that
    class. No smoothing is applied, so a word never seen in a class gets
    a value of 0 for that class.

    :param train_matrix: sequence of equal-length numeric word vectors, one
        per training document.
    :param train_category: sequence of labels aligned with ``train_matrix``;
        a label equal to 1 selects the second class, anything else the
        first.
    :return: tuple ``(p0_vector, p1_vector, p_abusive)``: the per-word
        ratios for the label-0 class, the per-word ratios for the label-1
        class, and the sum of the labels divided by the document count
        (the share of label-1 documents when labels are 0/1).
    """
    doc_count = len(train_matrix)
    vocab_size = len(train_matrix[0])
    p_abusive = sum(train_category) / float(doc_count)

    # Slot 0 accumulates the label-0 class, slot 1 the label-1 class.
    word_counts = [zeros(vocab_size), zeros(vocab_size)]
    word_totals = [0.0, 0.0]
    for doc_index in range(doc_count):
        slot = 1 if train_category[doc_index] == 1 else 0
        word_counts[slot] += train_matrix[doc_index]
        word_totals[slot] += sum(train_matrix[doc_index])

    p1_vector = word_counts[1] / word_totals[1]
    p0_vector = word_counts[0] / word_totals[0]
    return p0_vector, p1_vector, p_abusive


if __name__ == "__main__":
    # Demo 1: build the training set with its label vector, then derive the
    # vocabulary from the training set.
    # posts_list, classes_list = load_data_set()
    # my_vocabulary_list = create_vocabulary_list(posts_list)
    # print(my_vocabulary_list)

    # Demo 2: convert a single training document into a word vector.
    # posts_list, classes_list = load_data_set()
    # my_vocabulary_list = create_vocabulary_list(posts_list)
    # vector = word_set_to_vector(my_vocabulary_list, posts_list[0])
    # print(vector)

    # Demo 3 (active): vectorise every post, train the classifier and print
    # the value returned as p_abusive.
    posts_list, classes_list = load_data_set()
    my_vocabulary_list = create_vocabulary_list(posts_list)
    train_mat = [
        word_set_to_vector(my_vocabulary_list, doc_in_posts)
        for doc_in_posts in posts_list
    ]
    p0_vector, p1_vector, p_abusive = train_nb0(train_mat, classes_list)
    print(p_abusive)
